package com.etc;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Spark SQL example: a combined workflow over two JSON data sources.
 *
 * <p>The program reads student scores from a JSON file, keeps the students
 * whose score is at least 80, looks those students up in a second, in-memory
 * JSON data set carrying their ages, joins the two by name and prints the
 * resulting (name, score, age) rows.
 *
 * @author Wangcc
 * @since 2018/8/23
 */
public class JSONDataSource {

    /** Score file used when no path is supplied on the command line. */
    private static final String DEFAULT_STUDENTS_PATH = "f:\\students.json";

    /**
     * Runs the example end to end.
     *
     * @param args optional; {@code args[0]} overrides the path of the student
     *             score JSON file (defaults to {@code f:\students.json})
     */
    public static void main(String[] args) {
        String studentsPath = (args != null && args.length > 0) ? args[0] : DEFAULT_STUDENTS_PATH;

        SparkConf conf = new SparkConf()
                .setAppName("JSONDataSource").setMaster("local");

        // try-with-resources makes sure the context is closed even if a step below fails.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            SQLContext sqlContext = new SQLContext(sc);

            // Build a DataFrame from the JSON file and expose it to SQL as "students".
            Dataset<Row> studentScoresDF = sqlContext.read().json(studentsPath);
            studentScoresDF.createOrReplaceTempView("students");

            // Project the columns explicitly so that the positional reads below
            // (index 0 = name, index 1 = score) do not depend on the inferred column order.
            Dataset<Row> goodStudentScoresDF =
                    sqlContext.sql("select name, score from students where score>=80");

            List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(new Function<Row, String>() {
                @Override
                public String call(Row row) throws Exception {
                    return row.getString(0);
                }
            }).collect();

            for (String name : goodStudentNames) {
                System.out.println(name);
            }

            // Build a second DataFrame from an RDD whose elements are JSON strings.
            List<String> studentInfoJSONs = new ArrayList<String>();
            studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
            studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
            studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
            JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);

            Dataset<Row> studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

            // Filter with Column.isin instead of concatenating a SQL "in (...)" list:
            // no SQL text is generated, so an empty name list cannot yield "in ()"
            // and a name containing a quote cannot break or alter the statement.
            Dataset<Row> goodStudentInfosDF = studentInfosDF
                    .filter(studentInfosDF.col("name").isin(goodStudentNames.toArray()))
                    .select("name", "age");

            // Join (name, score) with (name, age) by name.
            JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                    toNameValuePairs(goodStudentScoresDF).join(toNameValuePairs(goodStudentInfosDF));

            // Flatten each joined record into a Row of (name, score, age).
            JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                    new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
                @Override
                public Row call(Tuple2<String, Tuple2<Integer, Integer>> t) throws Exception {
                    return RowFactory.create(t._1, t._2._1, t._2._2);
                }
            });

            // Describe the row layout so the RDD can be turned back into a DataFrame.
            List<StructField> structFields = new ArrayList<StructField>();
            structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
            structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
            StructType structType = DataTypes.createStructType(structFields);

            Dataset<Row> dataFrame = sqlContext.createDataFrame(goodStudentRowsRDD, structType);
            dataFrame.show();
        }
    }

    /**
     * Converts a two-column DataFrame into (name, value) pairs.
     *
     * <p>Column 0 is read as a string and column 1 as a long, which is then
     * narrowed to {@code Integer}.
     *
     * @param frame DataFrame whose first column is a string and second column a long
     * @return pair RDD keyed by the first column
     */
    private static JavaPairRDD<String, Integer> toNameValuePairs(Dataset<Row> frame) {
        return frame.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                // Going through the decimal string keeps the original behaviour of
                // failing loudly (NumberFormatException) instead of silently
                // truncating a value that does not fit in an int.
                return new Tuple2<String, Integer>(row.getString(0),
                        Integer.valueOf(String.valueOf(row.getLong(1))));
            }
        });
    }
}
